Setting up
First, load the necessary libraries:
library(reticulate) # for R - Python usage
library(grf) # for the regression forest
library(np) # for kernel smoothing methods
library(tidyverse)
library(foreach) # needed for faster implementation of my own NW-estimator
library(doParallel) # needed for faster implementation of my own NW-estimator
library(xgboost)
source("xgboost_smoother.R")
Do the same for Python, the reticulate package usually runs stuff in
a particular environment. In order to be safe and have all necessary
packages installed, run this code:
virtualenv_create("r-reticulate")
virtualenv: r-reticulate
use_virtualenv("r-reticulate", required = TRUE)
py_config()
python: /Users/henripfleiderer/.virtualenvs/r-reticulate/bin/python
libpython: /opt/homebrew/opt/python@3.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/config-3.11-darwin/libpython3.11.dylib
pythonhome: /Users/henripfleiderer/.virtualenvs/r-reticulate:/Users/henripfleiderer/.virtualenvs/r-reticulate
version: 3.11.6 (main, Oct 2 2023, 13:45:54) [Clang 15.0.0 (clang-1500.0.40.1)]
numpy: /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages/numpy
numpy_version: 1.26.4
NOTE: Python version was forced by use_python() function
#use_python("/Users/henripfleiderer/anaconda3/bin/python", required = TRUE)
virtualenv_install(packages = c("numpy==1.26.4","scipy", "scikit-learn","matplotlib","pandas"))
Using virtual environment '~/.virtualenvs/r-reticulate' ...
+ /Users/henripfleiderer/.virtualenvs/r-reticulate/bin/python -m pip install --upgrade --no-user 'numpy==1.26.4' scipy scikit-learn matplotlib pandas
Requirement already satisfied: numpy==1.26.4 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (1.26.4)
Requirement already satisfied: scipy in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (1.13.1)
Requirement already satisfied: scikit-learn in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (1.5.0)
Requirement already satisfied: matplotlib in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (3.9.0)
Requirement already satisfied: pandas in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (2.2.2)
Requirement already satisfied: joblib>=1.2.0 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from scikit-learn) (1.4.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from scikit-learn) (3.4.0)
Requirement already satisfied: contourpy>=1.0.1 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (1.2.1)
Requirement already satisfied: cycler>=0.10 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (4.51.0)
Requirement already satisfied: kiwisolver>=1.3.1 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (1.4.5)
Requirement already satisfied: packaging>=20.0 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (24.0)
Requirement already satisfied: pillow>=8 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (10.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from matplotlib) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from pandas) (2024.1)
Requirement already satisfied: six>=1.5 in /Users/henripfleiderer/.virtualenvs/r-reticulate/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
#py_config()
Now, import the necessary python packages
reticulate::repl_python()
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics.pairwise import pairwise_kernels # to manually compute the kernel function
from sklearn.model_selection import GridSearchCV
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.gaussian_process.kernels import RBF # matern kernel but for GP
from sklearn.gaussian_process.kernels import Matern # matern kernel but for GP
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
from scipy.stats import uniform
Functions for NW smooting in R:
Gaussian kernel, for two input vectors and one bandwidth, common
across all dimensions:
quit
gaussian_kernel = function(x1,x2,h, order = 2){
x1 = matrix(x1,ncol = 1)
x2 = matrix(x2,ncol = 1)
d = dim(x1)[1]
u = sqrt(t(x1-x2)%*%(x1-x2))/h
const = (2*pi)^(1/2)
if(order==2){
k = exp(-u^2/2)/const
} else if(order==4){
k = (3/2 - 1/2*u^2)*exp(-u^2/2)/const
} else if (order==6){
k = (15/8 - 5/4*u^2 + 1/8 * u^4)*exp(-u^2/2)/const
}
return(k)
}
A function for fitting a NW-estimator. With given bandwidth. Uses
parallel computing:
fit_kern_Reg_parallel = function(x_test, x_train, y_train, h,...) {
# Use foreach with .export to ensure functions are available in the parallel environment
# Use foreach to parallelize the outer loop -> this makes it faster
n = length(x_train[, 1])
H_mat = foreach(i = seq_along(x_test[, 1]), .combine = "rbind", .packages = c("base"), .export = c("gaussian_kernel")) %dopar% {
K_vec = numeric(n)
# Inner loop
for (j in seq_along(x_train[, 1])) {
K_vec[j] = gaussian_kernel(x_test[i, ], x_train[j, ], h,...)
}
# Return row for H_mat
K_vec / sum(K_vec)
}
fhat = H_mat%*%y_train
results = list(
predictions = fhat,
H = H_mat
)
return(results) # Return predictions
}
A function that computes the Generalized Cross-Validation (GCV)
objective, also uses parallel computing. As proposed in Racine (2019):
GCV_kern_smooth_parallel = function(x_train, y_train, h,...) {
n = length(x_train[, 1])
# Use foreach to parallelize the outer loop -> this makes it faster
H_mat = foreach(i = seq_along(x_train[, 1]), .combine = "rbind", .packages = c("base"), .export = c("gaussian_kernel")) %dopar% {
K_vec = numeric(n)
# Inner loop
for (j in seq_along(x_train[, 1])) {
K_vec[j] = gaussian_kernel(x_train[i, ], x_train[j, ], h,...)
}
# Return row for H_mat
K_vec / sum(K_vec)
}
# Compute trace and GCV
trace = sum(diag(diag(n) - H_mat))
GCV = (trace / n)^(-2) * (1 / n) * sum((y_train - H_mat %*% y_train)^2)
return(GCV)
}
A function that trains the model (optimizes bandwidth) and gives
predictions on test points:
fit_kern_Reg_GCV_parallel = function(x_test,x_train,y_train,h_min=0.1,h_max = 2,...){
# x_eval -> a n_test x d-dimensional matrix of points to evaluate the function:
# x_train -> training points, x values
# y_train -> training points, y values
# h_min -> min value for bandwidth
# h_max -> max value for bandwidth
# set up parallelization:
n_cores = detectCores() - 1
# Set up parallel cluster
cl = makeCluster(n_cores)
registerDoParallel(cl)
# first, find the optimal bandwidth:
result = optimize(function(h) GCV_kern_smooth_parallel(x_train, y_train, h,...),
interval = c(h_min, h_max),
tol = 1e-3)
# Extract optimal bandwidth
optimal_h = result$minimum
model_fit = fit_kern_Reg_parallel(x_test,x_train,y_train,optimal_h,...)
fhat = model_fit$predictions
optimal_H_mat = model_fit$H
stopCluster(cl)
result = list(
predictions = fhat,
h_opt = optimal_h,
H_opt = optimal_H_mat
)
return(result)
}
A function with a jump
Create a multivariate non-smooth function to check the curse of
dimensionality.
y_function = function(x){
y = 0.5*(x>-1&x<2) + 0.5 + 0.1*(x>=2) #+ cos(-x%*%b)
return(y)
}
Draw and visualize a realization for \(p =
1\):
set.seed(1234)
p = 1
X = matrix(seq(-5,5,0.01),ncol = 1)
y = y_function(X) + rnorm(dim(X)[1],sd = 0.2)
y_true = y_function(X)
tibble(X,y) %>%
ggplot(aes(x = X, y = y)) +
geom_point(alpha = 0.2)+
geom_line(aes(x = X, y = y_true), color = "black")+
theme_minimal()

Fit the function:
Kernel Ridge Regression:
Fit by KRR:
reticulate::repl_python()
param_distributions_Gaussian = {
"alpha": uniform(1e-9, 1),
"kernel__length_scale": uniform(1e-9, 1.5),
}
KRR_CV_Gaussian = RandomizedSearchCV(
KernelRidge(kernel = RBF()),
param_distributions=param_distributions_Gaussian,
n_iter=500,
n_jobs=-1
)
_ = KRR_CV_Gaussian.fit(r.X,r.y) # fit the kernel ridge regression using cross.validated lambda, suppresses output
#KRR_CV_Gaussian.best_params_
y_hat_KRR_Gaussian = KRR_CV_Gaussian.predict(r.X)
K = RBF(length_scale = KRR_CV_Gaussian.best_params_['kernel__length_scale'],length_scale_bounds = "fixed")
K_mat = K(r.X)
inv = np.linalg.inv(K_mat + KRR_CV_Gaussian.best_params_['alpha']*np.eye(K_mat.shape[0]))
weight_matrix_Gaussian = K_mat@ inv # weight matrix
Visualize:
quit
y_hat_KRR_Gaussian = as.numeric(py$y_hat_KRR_Gaussian)
tibble(value = c(y_true,y_hat_KRR_Gaussian),
X_grid = rep(X,2),
Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1])))) %>%
ggplot(aes(x = rep(X,2), y = rep(y,2))) +
geom_point(alpha = 0.2)+
geom_line(aes(x = X_grid, y = value, color = Method))+
scale_color_manual(values = c("True Function" = "black", "KRR" = "red")) +
theme_minimal()

Regression Forest:
reg_forest = regression_forest(X,y, tune.parameters = "all")
y_hat_rf = predict(reg_forest, X)$predictions
tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf),
X_grid = rep(X,3),
Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1])))) %>%
ggplot(aes(x = rep(X,3), y = rep(y,3))) +
geom_point(alpha = 0.1)+
geom_line(aes(x = X_grid, y = value, color = Method))+
scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green")) +
theme_minimal()

Nadayara-Watson:
fit_model = fit_kern_Reg_GCV_parallel(X,X,y,0.01,1)
y_hat_np = fit_model$predictions
tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np),
X_grid = rep(X,4),
Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1]) ))) %>%
ggplot(aes(x = rep(X,4), y = rep(y,4))) +
geom_point(alpha = 0.07)+
geom_line(aes(x = X_grid, y = value, color = Method))+
scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple")) +
theme_minimal()+
labs(x = "x", y = "y")

XG Boost
Set up:
# Convert to DMatrix object
dtrain = xgb.DMatrix(data = as.matrix(X), label = y)
#dtest = xgb.DMatrix(data = as.matrix(test_features), label = test_targets)
# Define model parameters
params = list(
booster = "gbtree",
objective = "reg:squarederror",
eta = 0.85, # Equivalent to learning_rate
max_depth = 6, # Need to specify a value as XGBoost requires a numerical value
min_child_weight = 100, # Not a direct equivalent but serves to control over-fitting
subsample = 1,
colsample_bytree = 1, # Equivalent to 'sqrt' in max_features
# Note: XGBoost does not have a direct equivalent for 'max_leaf_nodes' and 'init'
lambda = 100
)
# Number of boosting rounds (equivalent to n_estimators)
nrounds = 50
# Train the model
model = xgb.train(params = params, data = dtrain, nrounds = nrounds)
Get predictions and smoother matrix:
leaf_indices_train = predict(model, dtrain, predleaf = TRUE)
smoother_train_XG = create_S_from_gbtregressor(model,leaf_indices_train,output_dir,save_output = FALSE)
Plot the prediction:
y_hat_xg=predict(model, dtrain)
tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np,y_hat_xg),
X_grid = rep(X,5),
Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1]) , rep("XGBoost", dim(X)[1]) ))) %>%
ggplot(aes(x = rep(X,5), y = rep(y,5))) +
geom_point(alpha = 0.03)+
geom_line(aes(x = X_grid, y = value, color = Method))+
scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple", "XGBoost" = "orange")) +
theme_minimal()

Equivalent kernels:
Set some points where we want the equivalent kernels:
points = c(-4,-3,-1.05,0.01,1.95,4)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(X,y) %>%
ggplot(aes(x = X, y = y)) +
geom_point(alpha = 0.2)+
geom_line(aes(x = X, y = y_true), color = "black")
for (i in 1:length(points)){
plot = plot+geom_vline(xintercept = points[i], linetype = "dashed")
}
plot + theme_minimal()

Visualize KRR
Visualize for KRR:
results_weights_KRR = as_tibble(results_weights_KRR)
results_weights_KRR = results_weights_KRR %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_KRR %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

Boundary
Plot:
results_weights_KRR_boundary = as_tibble(results_weights_KRR_boundary)
results_weights_KRR_boundary = results_weights_KRR_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_KRR_boundary %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

Visualize RF
Visualize for regression forest:
results_weights_RF = as_tibble(results_weights_RF)
results_weights_RF = results_weights_RF %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_RF %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

Boundary
results_weights_RF_boundary = as_tibble(results_weights_RF_boundary)
results_weights_RF_boundary = results_weights_RF_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_RF_boundary %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

Visualize NW:
results_weights_NW = as_tibble(results_weights_NW)
results_weights_NW = results_weights_NW %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_NW %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

Boundary
results_weights_NW_boundary = as_tibble(results_weights_NW_boundary)
results_weights_NW_boundary = results_weights_NW_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_NW_boundary %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

Visualize XG
results_weights_XG = as_tibble(results_weights_XG)
results_weights_XG = results_weights_XG %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_XG %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

Boundary
results_weights_XG_boundary = as_tibble(results_weights_XG_boundary)
results_weights_XG_boundary = results_weights_XG_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_XG_boundary %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

New Function, just a line
Create a “simple” function:
y_function = function(x){
y = 0.5*x
return(y)
}
Draw and visualize a realization for \(p =
1\). Observations scattered unevenly along the \(x\)-axis:
set.seed(1234)
p = 1
X = matrix(c(runif(500,-5,-2),runif(50,-2,1),runif(1000,1,3),runif(100,3,5)),ncol = 1)
y = y_function(X) + rnorm(dim(X)[1],sd = 0.2)
y_true = y_function(X)
tibble(X,y) %>%
ggplot(aes(x = X, y = y)) +
geom_point(alpha = 0.2)+
geom_line(aes(x = X, y = y_true), color = "black")+
theme_minimal()

Fit the function:
Kernel Ridge Regression:
Fit by KRR:
reticulate::repl_python()
param_distributions_Gaussian = {
"alpha": uniform(1e-9, 1),
"kernel__length_scale": uniform(1e-9, 1.5),
}
KRR_CV_Gaussian = RandomizedSearchCV(
KernelRidge(kernel = RBF()),
param_distributions=param_distributions_Gaussian,
n_iter=500,
n_jobs=-1
)
_ = KRR_CV_Gaussian.fit(r.X,r.y) # fit the kernel ridge regression using cross.validated lambda, suppresses output
#KRR_CV_Gaussian.best_params_
y_hat_KRR_Gaussian = KRR_CV_Gaussian.predict(r.X)
K = RBF(length_scale = KRR_CV_Gaussian.best_params_['kernel__length_scale'],length_scale_bounds = "fixed")
K_mat = K(r.X)
inv = np.linalg.inv(K_mat + KRR_CV_Gaussian.best_params_['alpha']*np.eye(K_mat.shape[0]))
weight_matrix_Gaussian = K_mat@ inv # weight matrix
Visualize:
quit
y_hat_KRR_Gaussian = as.numeric(py$y_hat_KRR_Gaussian)
tibble(value = c(y_true,y_hat_KRR_Gaussian),
X_grid = rep(X,2),
Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1])))) %>%
ggplot(aes(x = rep(X,2), y = rep(y,2))) +
geom_point(alpha = 0.2)+
geom_line(aes(x = X_grid, y = value, color = Method))+
scale_color_manual(values = c("True Function" = "black", "KRR" = "red")) +
theme_minimal()

Regression Forest:
reg_forest = regression_forest(X,y, tune.parameters = "all")
y_hat_rf = predict(reg_forest, X)$predictions
tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf),
X_grid = rep(X,3),
Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1])))) %>%
ggplot(aes(x = rep(X,3), y = rep(y,3))) +
geom_point(alpha = 0.1)+
geom_line(aes(x = X_grid, y = value, color = Method))+
scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green")) +
theme_minimal()

Nadayara-Watson
fit_model = fit_kern_Reg_GCV_parallel(X,X,y,0.01,1)
y_hat_np = fit_model$predictions
tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np),
X_grid = rep(X,4),
Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1]) ))) %>%
ggplot(aes(x = rep(X,4), y = rep(y,4))) +
geom_point(alpha = 0.07)+
geom_line(aes(x = X_grid, y = value, color = Method))+
scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple")) +
theme_minimal()

XG Boost
Set up:
# Convert to DMatrix object
dtrain = xgb.DMatrix(data = as.matrix(X), label = y)
#dtest = xgb.DMatrix(data = as.matrix(test_features), label = test_targets)
# Define model parameters
params = list(
booster = "gbtree",
objective = "reg:squarederror",
eta = 0.85, # Equivalent to learning_rate
max_depth = 6, # Need to specify a value as XGBoost requires a numerical value
min_child_weight = 50, # Not a direct equivalent but serves to control over-fitting
subsample = 1,
colsample_bytree = 1, # Equivalent to 'sqrt' in max_features
# Note: XGBoost does not have a direct equivalent for 'max_leaf_nodes' and 'init'
lambda = 1
)
# Number of boosting rounds (equivalent to n_estimators)
nrounds = 50
# Train the model
model = xgb.train(params = params, data = dtrain, nrounds = nrounds)
Get predictions and smoother matrix:
leaf_indices_train = predict(model, dtrain, predleaf = TRUE)
smoother_train_XG = create_S_from_gbtregressor(model,leaf_indices_train,output_dir,save_output = FALSE)
Plot the prediction:
y_hat_xg=predict(model, dtrain)
tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np,y_hat_xg),
X_grid = rep(X,5),
Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1]) , rep("XGBoost", dim(X)[1]) ))) %>%
ggplot(aes(x = rep(X,5), y = rep(y,5))) +
geom_point(alpha = 0.03)+
geom_line(aes(x = X_grid, y = value, color = Method))+
scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple", "XGBoost" = "orange")) +
theme_minimal()

Equivalent kernels
Set some points where we want the equivalent kernels:
points = quantile(X,probs = c(0.1,0.2,0.31,0.32,0.35,0.7,0.98), type = 1)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(X,y) %>%
ggplot(aes(x = X, y = y)) +
geom_point(alpha = 0.2)+
geom_line(aes(x = X, y = y_true), color = "black")
for (i in 1:length(points)){
plot = plot+geom_vline(xintercept = points[i], linetype = "dashed")
}
plot + theme_minimal()

Visualize KRR
Visualize for KRR:
results_weights_KRR = as_tibble(results_weights_KRR)
results_weights_KRR = results_weights_KRR %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_KRR %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

Visualize RF
Visualize for KRR:
results_weights_RF = as_tibble(results_weights_RF)
results_weights_RF = results_weights_RF %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_RF %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

Visualize NW
results_weights_NW = as_tibble(results_weights_NW)
results_weights_NW = results_weights_NW %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_NW %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

Visualize XG
results_weights_XG = as_tibble(results_weights_XG)
results_weights_XG = results_weights_XG %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))
results_weights_XG %>%
ggplot(aes(x = X, y = weight, color = x))+
geom_line()+
theme_minimal()

References
Racine, Jeffrey S. 2019.
An Introduction to the
Advanced Theory and Practice of
Nonparametric Econometrics: A
Replicable Approach Using
R. Cambridge: Cambridge University Press.
https://doi.org/10.1017/9781108649841.
---
title: "Equivalent Kernel - function with jumps/cut-off points"
output: html_notebook
bibliography: references.bib
---

# Setting up

First, load the necessary libraries:

```{r packages,warning = FALSE, message=FALSE, results = 'hide'}
library(reticulate) # for R - Python usage
library(grf) # for the regression forest
library(np) # for kernel smoothing methods
library(tidyverse)
library(foreach) # needed for faster implementation of my own NW-estimator
library(doParallel) # needed for faster implementation of my own NW-estimator
library(xgboost)
source("xgboost_smoother.R")
```

Do the same for Python, the reticulate package usually runs stuff in a particular environment. In order to be safe and have all necessary packages installed, run this code:


```{r, message = FALSE, warning=FALSE}
virtualenv_create("r-reticulate")
use_virtualenv("r-reticulate", required = TRUE)
py_config()
#use_python("/Users/henripfleiderer/anaconda3/bin/python", required = TRUE)
virtualenv_install(packages = c("numpy==1.26.4","scipy", "scikit-learn","matplotlib","pandas"))
#py_config()
```

Now, import the necessary python packages 
```{python}
import numpy as np

from sklearn.kernel_ridge import KernelRidge

from sklearn.metrics.pairwise import pairwise_kernels # to manually compute the kernel function

from sklearn.model_selection import GridSearchCV

from scipy import stats

import matplotlib.pyplot as plt

import pandas as pd

from sklearn.gaussian_process.kernels import RBF # matern kernel but for GP 

from sklearn.gaussian_process.kernels import Matern # matern kernel but for GP 

from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import loguniform

from scipy.stats import uniform
```

# Functions for NW smooting in R:

Gaussian kernel, for two input vectors and one bandwidth, common across all dimensions:
```{r}
gaussian_kernel = function(x1,x2,h, order = 2){
  
  x1 = matrix(x1,ncol = 1)
  x2 = matrix(x2,ncol = 1)
  
  d = dim(x1)[1]
  
  u = sqrt(t(x1-x2)%*%(x1-x2))/h
  
  const = (2*pi)^(1/2)
  
  if(order==2){
  k = exp(-u^2/2)/const
  } else if(order==4){
    k = (3/2 - 1/2*u^2)*exp(-u^2/2)/const
  } else if (order==6){
    k = (15/8 - 5/4*u^2 + 1/8 * u^4)*exp(-u^2/2)/const
  }
  
  return(k)
}
```

A function for fitting a NW-estimator. With given bandwidth. Uses parallel computing:

```{r}
fit_kern_Reg_parallel = function(x_test, x_train, y_train, h,...) {
  # Use foreach with .export to ensure functions are available in the parallel environment
  # Use foreach to parallelize the outer loop -> this makes it faster
  n = length(x_train[, 1])
  H_mat = foreach(i = seq_along(x_test[, 1]), .combine = "rbind", .packages = c("base"), .export = c("gaussian_kernel")) %dopar% {
    K_vec = numeric(n)
    
    # Inner loop
    for (j in seq_along(x_train[, 1])) {
      K_vec[j] = gaussian_kernel(x_test[i, ], x_train[j, ], h,...)
    }
    
    # Return row for H_mat
    K_vec / sum(K_vec)
  }
  
  fhat = H_mat%*%y_train
  
  results = list(
    predictions = fhat,
    H = H_mat
  )
  return(results)  # Return predictions
}

```

A function that computes the Generalized Cross-Validation (GCV) objective, also uses parallel computing. As proposed in @racine_introduction_2019:

```{r}
GCV_kern_smooth_parallel = function(x_train, y_train, h,...) {
  n = length(x_train[, 1])
  
  # Use foreach to parallelize the outer loop -> this makes it faster
  H_mat = foreach(i = seq_along(x_train[, 1]), .combine = "rbind", .packages = c("base"), .export = c("gaussian_kernel")) %dopar% {
    K_vec = numeric(n)
    
    # Inner loop
    for (j in seq_along(x_train[, 1])) {
      K_vec[j] = gaussian_kernel(x_train[i, ], x_train[j, ], h,...)
    }
    
    # Return row for H_mat
    K_vec / sum(K_vec)
  }
  
  # Compute trace and GCV
  trace = sum(diag(diag(n) - H_mat))
  GCV = (trace / n)^(-2) * (1 / n) * sum((y_train - H_mat %*% y_train)^2)
  
  return(GCV)
}
```

A function that trains the model (optimizes bandwidth) and gives predictions on test points:
```{r}
fit_kern_Reg_GCV_parallel = function(x_test,x_train,y_train,h_min=0.1,h_max = 2,...){
  # x_eval -> a n_test x d-dimensional matrix of points to evaluate the function:
  # x_train -> training points, x values
  # y_train -> training points, y values
  # h_min -> min value for bandwidth
  # h_max -> max value for bandwidth
  
  # set up parallelization:
  n_cores = detectCores() - 1
  
  # Set up parallel cluster
  cl = makeCluster(n_cores)
  registerDoParallel(cl)
  
  # first, find the optimal bandwidth:
  
  result = optimize(function(h) GCV_kern_smooth_parallel(x_train, y_train, h,...), 
                    interval = c(h_min, h_max), 
                    tol = 1e-3)
  
  # Extract optimal bandwidth
  optimal_h = result$minimum
  
  model_fit = fit_kern_Reg_parallel(x_test,x_train,y_train,optimal_h,...)
  fhat = model_fit$predictions
  optimal_H_mat = model_fit$H
  
  stopCluster(cl)
  result = list(
    predictions = fhat,
    h_opt = optimal_h,
    H_opt = optimal_H_mat
  )
  
  return(result)
}

```

# A function with a jump 

Create a multivariate non-smooth function to check the curse of dimensionality.

```{r}
y_function = function(x){
  y = 0.5*(x>-1&x<2) + 0.5 + 0.1*(x>=2)  #+ cos(-x%*%b)
  return(y)
}
```

Draw and visualize a realization for $p = 1$:

```{r}
set.seed(1234)
p = 1
X = matrix(seq(-5,5,0.01),ncol = 1)
y = y_function(X) + rnorm(dim(X)[1],sd = 0.2)
y_true = y_function(X)
tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")+
  theme_minimal()
```
# Fit the function:

### Kernel Ridge Regression:

Fit by KRR:


```{python}
param_distributions_Gaussian = {
  "alpha": uniform(1e-9, 1),
  "kernel__length_scale": uniform(1e-9, 1.5),
}

KRR_CV_Gaussian = RandomizedSearchCV(
    KernelRidge(kernel = RBF()),
    param_distributions=param_distributions_Gaussian,
    n_iter=500,
    n_jobs=-1
    )
_ = KRR_CV_Gaussian.fit(r.X,r.y) # fit the kernel ridge regression using cross.validated lambda, suppresses output
#KRR_CV_Gaussian.best_params_
y_hat_KRR_Gaussian = KRR_CV_Gaussian.predict(r.X)
K = RBF(length_scale = KRR_CV_Gaussian.best_params_['kernel__length_scale'],length_scale_bounds = "fixed")
K_mat = K(r.X)
inv = np.linalg.inv(K_mat + KRR_CV_Gaussian.best_params_['alpha']*np.eye(K_mat.shape[0])) 
weight_matrix_Gaussian = K_mat@ inv # weight matrix
```

Visualize:

```{r}
y_hat_KRR_Gaussian = as.numeric(py$y_hat_KRR_Gaussian)

tibble(value = c(y_true,y_hat_KRR_Gaussian),
       X_grid = rep(X,2),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,2), y = rep(y,2))) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red")) +
  theme_minimal()
```

### Regression Forest:

```{r}
reg_forest = regression_forest(X,y, tune.parameters = "all")
y_hat_rf = predict(reg_forest, X)$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf),
       X_grid = rep(X,3),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,3), y = rep(y,3))) +
  geom_point(alpha = 0.1)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green")) +
  theme_minimal()
```

### Nadayara-Watson:
```{r}
fit_model = fit_kern_Reg_GCV_parallel(X,X,y,0.01,1)
y_hat_np = fit_model$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np),
       X_grid = rep(X,4),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,4), y = rep(y,4))) +
  geom_point(alpha = 0.07)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple")) +
  theme_minimal()+
  labs(x = "x", y = "y")
```

### XG Boost

Set up:

```{r}
# Convert to DMatrix object
dtrain = xgb.DMatrix(data = as.matrix(X), label = y)
#dtest = xgb.DMatrix(data = as.matrix(test_features), label = test_targets)

# Define model parameters
params = list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eta = 0.85, # Equivalent to learning_rate
  max_depth = 6, # Need to specify a value as XGBoost requires a numerical value
  min_child_weight = 100, # Not a direct equivalent but serves to control over-fitting
  subsample = 1,
  colsample_bytree = 1, # Equivalent to 'sqrt' in max_features
  # Note: XGBoost does not have a direct equivalent for 'max_leaf_nodes' and 'init'
  lambda = 100
)

# Number of boosting rounds (equivalent to n_estimators)
nrounds = 50

# Train the model
model = xgb.train(params = params, data = dtrain, nrounds = nrounds)
```

Get predictions and smoother matrix:

```{r, results = 'hide', message = FALSE}
leaf_indices_train = predict(model, dtrain, predleaf = TRUE)
smoother_train_XG = create_S_from_gbtregressor(model,leaf_indices_train,output_dir,save_output = FALSE)
```

Plot the prediction:

```{r}
y_hat_xg=predict(model, dtrain)

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np,y_hat_xg),
       X_grid = rep(X,5),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1]) , rep("XGBoost", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,5), y = rep(y,5))) +
  geom_point(alpha = 0.03)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple", "XGBoost" = "orange")) +
  theme_minimal()
```

# Equivalent kernels:

Set some points where we want the equivalent kernels:

```{r}
points = c(-4,-3,-1.05,0.01,1.95,4)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")

for (i in 1:length(points)){
  plot = plot+geom_vline(xintercept = points[i], linetype = "dashed")
  
}   
plot + theme_minimal()
```

### Extract the weights

Extract the equivalent kernels/weights applied at these points:

```{r}
# KRR
results_weights_KRR = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_KRR) = rep("Temp",length(points))

# Regression Forest
results_weights_RF = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_RF) = rep("Temp",length(points))

# Nadayara-Watson:
results_weights_NW = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_NW) = rep("Temp",length(points))

# XG Boost:
results_weights_XG = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_XG) = rep("Temp",length(points))


weights_KRR_Gaussian = py$weight_matrix_Gaussian



for (i in 1:length(points)){
  # KRR
  results_weights_KRR[,i] = weights_KRR_Gaussian[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_KRR)[i] = paste0("x=",points[i])
  # RF:
  results_weights_RF[,i] = matrix(get_forest_weights(reg_forest,as.matrix(points[i])),ncol = 1)
  colnames(results_weights_RF)[i] = paste0("x=",points[i])
    # NW:
  results_weights_NW[,i] = fit_model$H_opt[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_NW)[i] = paste0("x=",points[i])
    # XG
  results_weights_XG[,i] = smoother_train_XG[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_XG)[i] = paste0("x=",points[i])
  
}

```


##### Boundary adaptiveness

```{r}
points_boundary = c(-4.99,-4.5,-4,-3.5)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(x = X[1:round(length(y)/2),],y = y[1:round(length(X)/2)]) %>% 
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = x, y = y_true[1:round(length(X)/2)]), color = "black")

for (i in 1:length(points_boundary)){
  plot = plot+geom_vline(xintercept = points_boundary[i], linetype = "dashed")
  
}   
plot + theme_minimal()
```

Extract the equivalent kernels/weights applied at these points:

```{r}
# KRR
results_weights_KRR_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_KRR_boundary) = rep("Temp",length(points_boundary))

# Regression Forest
results_weights_RF_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_RF_boundary) = rep("Temp",length(points_boundary))

# Nadayara-Watson:
results_weights_NW_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_NW_boundary) = rep("Temp",length(points_boundary))

# XGBoost:
results_weights_XG_boundary = matrix(NA,nrow = length(X), ncol = length(points_boundary))
colnames(results_weights_XG_boundary) = rep("Temp",length(points_boundary))


weights_KRR_Gaussian = py$weight_matrix_Gaussian

for (i in 1:length(points_boundary)){
  # KRR
  results_weights_KRR_boundary[,i] = weights_KRR_Gaussian[which(abs((as.numeric(X)-points_boundary[i]))<0.000001),]
  colnames(results_weights_KRR_boundary)[i] = paste0("x=",points_boundary[i])
  # RF:
  results_weights_RF_boundary[,i] = matrix(get_forest_weights(reg_forest,as.matrix(points_boundary[i])),ncol = 1)
  colnames(results_weights_RF_boundary)[i] = paste0("x=",points_boundary[i])
  # NW:
  results_weights_NW_boundary[,i] = fit_model$H_opt[which(abs((as.numeric(X)-points_boundary[i]))<0.000001),]
  colnames(results_weights_NW_boundary)[i] = paste0("x=",points_boundary[i])
  # XG
  results_weights_XG_boundary[,i] = smoother_train_XG[which(abs((as.numeric(X)-points_boundary[i]))<0.000001),]
  colnames(results_weights_XG_boundary)[i] = paste0("x=",points[i])
  
}

```


### Visualize KRR

Visualize for KRR:


```{r}
results_weights_KRR = as_tibble(results_weights_KRR)
results_weights_KRR = results_weights_KRR %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_KRR %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

##### Boundary

Plot:

```{r}
results_weights_KRR_boundary = as_tibble(results_weights_KRR_boundary)
results_weights_KRR_boundary = results_weights_KRR_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_KRR_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

### Visualize RF

Visualize for regression forest:


```{r}
results_weights_RF = as_tibble(results_weights_RF)
results_weights_RF = results_weights_RF %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_RF %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

##### Boundary

```{r}
results_weights_RF_boundary = as_tibble(results_weights_RF_boundary)
results_weights_RF_boundary = results_weights_RF_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_RF_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```
### Visualize NW:

```{r}
results_weights_NW = as_tibble(results_weights_NW)
results_weights_NW = results_weights_NW %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_NW %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

##### Boundary

```{r}
results_weights_NW_boundary = as_tibble(results_weights_NW_boundary)
results_weights_NW_boundary = results_weights_NW_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_NW_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```
### Visualize XG

```{r}
results_weights_XG = as_tibble(results_weights_XG)
results_weights_XG = results_weights_XG %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_XG %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```
##### Boundary

```{r}
results_weights_XG_boundary = as_tibble(results_weights_XG_boundary)
results_weights_XG_boundary = results_weights_XG_boundary %>% slice_head(n = round(length(y)/2)) %>% mutate(X = as.numeric(X[1:round(length(y)/2),])) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_XG_boundary %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

# New Function, just a line

Create a "simple" function:

```{r}
y_function = function(x){
  y = 0.5*x
  return(y)
}
```

Draw and visualize a realization for $p = 1$. Observations scattered unevenly along the $x$-axis:

```{r}
set.seed(1234)
p = 1
X = matrix(c(runif(500,-5,-2),runif(50,-2,1),runif(1000,1,3),runif(100,3,5)),ncol = 1)
y = y_function(X) + rnorm(dim(X)[1],sd = 0.2)
y_true = y_function(X)
tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")+
  theme_minimal()
```

# Fit the function:

### Kernel Ridge Regression:

Fit by KRR:


```{python}
param_distributions_Gaussian = {
  "alpha": uniform(1e-9, 1),
  "kernel__length_scale": uniform(1e-9, 1.5),
}

KRR_CV_Gaussian = RandomizedSearchCV(
    KernelRidge(kernel = RBF()),
    param_distributions=param_distributions_Gaussian,
    n_iter=500,
    n_jobs=-1
    )
_ = KRR_CV_Gaussian.fit(r.X,r.y) # fit the kernel ridge regression using cross.validated lambda, suppresses output
#KRR_CV_Gaussian.best_params_
y_hat_KRR_Gaussian = KRR_CV_Gaussian.predict(r.X)
K = RBF(length_scale = KRR_CV_Gaussian.best_params_['kernel__length_scale'],length_scale_bounds = "fixed")
K_mat = K(r.X)
inv = np.linalg.inv(K_mat + KRR_CV_Gaussian.best_params_['alpha']*np.eye(K_mat.shape[0])) 
weight_matrix_Gaussian = K_mat@ inv # weight matrix
```

Visualize:

```{r}
y_hat_KRR_Gaussian = as.numeric(py$y_hat_KRR_Gaussian)

tibble(value = c(y_true,y_hat_KRR_Gaussian),
       X_grid = rep(X,2),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,2), y = rep(y,2))) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red")) +
  theme_minimal()
```

### Regression Forest:

```{r}
reg_forest = regression_forest(X,y, tune.parameters = "all")
y_hat_rf = predict(reg_forest, X)$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf),
       X_grid = rep(X,3),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1])))) %>% 
  ggplot(aes(x = rep(X,3), y = rep(y,3))) +
  geom_point(alpha = 0.1)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green")) +
  theme_minimal()
```

### Nadayara-Watson
```{r}
fit_model = fit_kern_Reg_GCV_parallel(X,X,y,0.01,1)
y_hat_np = fit_model$predictions

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np),
       X_grid = rep(X,4),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,4), y = rep(y,4))) +
  geom_point(alpha = 0.07)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple")) +
  theme_minimal()
```
### XG Boost

Set up:

```{r}
# Convert to DMatrix object
dtrain = xgb.DMatrix(data = as.matrix(X), label = y)
#dtest = xgb.DMatrix(data = as.matrix(test_features), label = test_targets)

# Define model parameters
params = list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eta = 0.85, # Equivalent to learning_rate
  max_depth = 6, # Need to specify a value as XGBoost requires a numerical value
  min_child_weight = 50, # Not a direct equivalent but serves to control over-fitting
  subsample = 1,
  colsample_bytree = 1, # Equivalent to 'sqrt' in max_features
  # Note: XGBoost does not have a direct equivalent for 'max_leaf_nodes' and 'init'
  lambda = 1
)

# Number of boosting rounds (equivalent to n_estimators)
nrounds = 50

# Train the model
model = xgb.train(params = params, data = dtrain, nrounds = nrounds)
```

Get predictions and smoother matrix:

```{r, results = 'hide', message = FALSE}
leaf_indices_train = predict(model, dtrain, predleaf = TRUE)
smoother_train_XG = create_S_from_gbtregressor(model,leaf_indices_train,output_dir,save_output = FALSE)
```

Plot the prediction:

```{r}
y_hat_xg=predict(model, dtrain)

tibble(value = c(y_true,y_hat_KRR_Gaussian,y_hat_rf,y_hat_np,y_hat_xg),
       X_grid = rep(X,5),
       Method = factor(c(rep("True Function", dim(X)[1]),rep("KRR", dim(X)[1]), rep("Random Forest", dim(X)[1]), rep("NW", dim(X)[1]) , rep("XGBoost", dim(X)[1])    ))) %>% 
  ggplot(aes(x = rep(X,5), y = rep(y,5))) +
  geom_point(alpha = 0.03)+
  geom_line(aes(x = X_grid, y = value, color = Method))+
  scale_color_manual(values = c("True Function" = "black", "KRR" = "red", "Random Forest" = "green", "NW" = "purple", "XGBoost" = "orange")) +
  theme_minimal()
```
# Equivalent kernels

Set some points where we want the equivalent kernels:

```{r}
points = quantile(X,probs = c(0.1,0.2,0.31,0.32,0.35,0.7,0.98), type = 1)
#points = c(-4,-3.5,-2,-1.05,0.01,1.5,1.95,2.5,4)
plot = tibble(X,y) %>% 
  ggplot(aes(x = X, y = y)) +
  geom_point(alpha = 0.2)+
  geom_line(aes(x = X, y = y_true), color = "black")

for (i in 1:length(points)){
  plot = plot+geom_vline(xintercept = points[i], linetype = "dashed")
  
}   
plot + theme_minimal()
```

### Extract the weights:

Extract the equivalent kernels/weights applied at these points:

```{r}
# KRR
results_weights_KRR = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_KRR) = rep("Temp",length(points))

# Regression Forest
results_weights_RF = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_RF) = rep("Temp",length(points))

# Nadayara-Watson:
results_weights_NW = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_NW) = rep("Temp",length(points))

# XG Boost:
results_weights_XG = matrix(NA,nrow = length(X), ncol = length(points))
colnames(results_weights_XG) = rep("Temp",length(points))

weights_KRR_Gaussian = py$weight_matrix_Gaussian

for (i in 1:length(points)){
  # KRR
  results_weights_KRR[,i] = weights_KRR_Gaussian[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_KRR)[i] = paste0("x=",points[i])
  # RF:
  results_weights_RF[,i] = matrix(get_forest_weights(reg_forest,as.matrix(points[i])),ncol = 1)
  colnames(results_weights_RF)[i] = paste0("x=",points[i])
  # NW:
  results_weights_NW[,i] = fit_model$H_opt[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_NW)[i] = paste0("x=",points[i])
  # XG
  results_weights_XG[,i] = smoother_train_XG[which(abs((as.numeric(X)-points[i]))<0.000001),]
  colnames(results_weights_XG)[i] = paste0("x=",points[i])
  
}

```

### Visualize KRR

Visualize for KRR:


```{r}
results_weights_KRR = as_tibble(results_weights_KRR)
results_weights_KRR = results_weights_KRR %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_KRR %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```
### Visualize RF

Visualize for KRR:


```{r}
results_weights_RF = as_tibble(results_weights_RF)
results_weights_RF = results_weights_RF %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_RF %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```
### Visualize NW

```{r}
results_weights_NW = as_tibble(results_weights_NW)
results_weights_NW = results_weights_NW %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_NW %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

### Visualize XG

```{r}
results_weights_XG = as_tibble(results_weights_XG)
results_weights_XG = results_weights_XG %>% mutate(X = as.numeric(X)) %>% pivot_longer(cols = starts_with("x="), names_to = "x", names_prefix = "x=", values_to = "weight") %>% mutate(x = as_factor(x))

results_weights_XG %>%
  ggplot(aes(x = X, y = weight, color = x))+
  geom_line()+
  theme_minimal()
```

# References

<div id="refs"></div>